import numpy as np
import plotly.express as px
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import umap
from pyensembl import EnsemblRelease
from itertools import product
from Bio.Seq import translate
import pickle
import ibis
ibis.set_backend("duckdb")
ibis.options.interactive = True
from ibis import _
import ibis.selectors as s
import warnings
warnings.filterwarnings('ignore')
/Users/jordanramsdell/mambaforge/envs/ml_ibis/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
def construct_databases(base_loc):
mappings = {}
for directory in os.listdir(base_loc):
if directory.startswith("."):
continue
loc = base_loc + "/" + directory
t = ibis.read_parquet(loc)
mappings["t_" + directory] = t
return mappings
# Load parquet databases into local variables
locals().update(construct_databases("../../../data/open_targets/"))
def vectorize_and_embed(docs, n_components=3, use_densmap=False,
metric='euclidean', n_neighbors=15, vectorizer_fun=lambda: CountVectorizer(stop_words='english')):
counts = vectorizer_fun().fit_transform(docs)
mapper = umap.UMAP(n_components=n_components, densmap=use_densmap, metric=metric, random_state=42).fit(counts)
return mapper
def construct_scatterplot(df, mapper, hover_name, color=None, hover_data=None):
embeddings = mapper.embedding_.T
df["x"], df["y"], df["z"] = embeddings
fig = px.scatter_3d(df, x="x", y="y", z="z", color=color,
hover_name=hover_name, hover_data=hover_data)
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
fig.update_traces(marker=dict(size=2))
return fig.show()
# For this demo, only considering protein-coding targets
protein_coding_targets = t_targets.filter(_.biotype == 'protein_coding')
subcell_by_id = (protein_coding_targets
.select("id", _.subcellularLocations.unnest())
.unpack("subcellularLocations")
.filter(~ _.location.startswith("["))
.group_by("id")
.agg(location = _.location.first()))
subcell_by_id
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ location ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ ├─────────────────┼──────────────────────┤ │ ENSG00000076555 │ Mitochondrion │ │ ENSG00000095794 │ Nucleus │ │ ENSG00000102755 │ Endosome │ │ ENSG00000122566 │ Nucleus │ │ ENSG00000135722 │ Golgi apparatus │ │ ENSG00000163638 │ Secreted │ │ ENSG00000169371 │ Nucleus │ │ ENSG00000198040 │ Nucleus │ │ ENSG00000040531 │ Melanosome membrane │ │ ENSG00000136270 │ Mitochondrion matrix │ │ … │ … │ └─────────────────┴──────────────────────┘
go_name_to_gene_id = (protein_coding_targets
.select(id=_.id, go_id=_.go.unnest().id)
.inner_join(t_go, _.go_id == t_go.id)
.group_by("id")
.agg(go_desc = _.name.collect().join(" ")))
go_name_to_gene_id
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ go_desc ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ ├─────────────────┼──────────────────────────────────────────────────────────────────────────────────┤ │ ENSG00000116748 │ IMP salvage AMP deaminase activity metal ion binding IMP biosynthetic process G… │ │ ENSG00000138755 │ defense response to virus positive regulation of release of sequestered calcium… │ │ ENSG00000141519 │ cytoplasm motile cilium assembly flagellated sperm motility cilium axoneme cili… │ │ ENSG00000165118 │ tRNA-guanine transglycosylation molecular_function cellular_component biologica… │ │ ENSG00000166181 │ membrane RNA binding cytoplasm fibroblast apoptotic process negative regulation… │ │ ENSG00000170006 │ membrane protein binding │ │ ENSG00000198203 │ sulfotransferase activity cytoplasm lysosome aryl sulfotransferase activity ami… │ │ ENSG00000214193 │ plasma membrane nucleoplasm cell migration actin filament organization │ │ ENSG00000064199 │ sperm fibrous sheath binding of sperm to zona pellucida external side of plasma… │ │ ENSG00000140326 │ endomembrane system protein localization plasma membrane cytoplasm chromatin or… │ │ … │ … │ └─────────────────┴──────────────────────────────────────────────────────────────────────────────────┘
query = (protein_coding_targets
.left_join(subcell_by_id, "id", rname="cell_id")
.left_join(go_name_to_gene_id, "id", rname="go_id")
.mutate(functionDescriptions=_.functionDescriptions.join(" "))
.select("id", "approvedName", "go_desc", "functionDescriptions", "location")
.mutate(truncDesc = _.functionDescriptions[0:50])
.fillna({"location":"N/A", "functionDescriptions":"N/A", "go_desc":"N/A"})
.filter(_.go_desc != "N/A"
and _.location != "N/A"
and _.functionDescriptions != "N/A"))
query
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ approvedName ┃ go_desc ┃ functionDescriptions ┃ location ┃ truncDesc ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ string │ string │ string │ string │ ├─────────────────┼─────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────┼────────────────────────────────────────────────────┤ │ ENSG00000059588 │ TAR (HIV-1) RNA binding protein 1 │ tRNA (guanine) methyltransferase activity regulation of transcription by RNA po… │ Probable S-adenosyl-L-methionine-dependent methyltransferase which methylates R… │ Nuclear speckles │ Probable S-adenosyl-L-methionine-dependent methylt │ │ ENSG00000072071 │ adhesion G protein-coupled receptor L1 │ latrotoxin receptor activity G protein-coupled receptor signaling pathway plasm… │ Calcium-independent receptor of high affinity for alpha- latrotoxin, an excitat… │ Cell membrane │ Calcium-independent receptor of high affinity for │ │ ENSG00000073536 │ notchless homolog 1 │ skeletal system morphogenesis positive regulation of canonical Wnt signaling pa… │ Plays a role in regulating Notch activity. Plays a role in regulating the expre… │ Nucleus │ Plays a role in regulating Notch activity. Plays a │ │ ENSG00000075290 │ Wnt family member 8B │ canonical Wnt signaling pathway signal transduction nervous system development … │ Ligand for members of the frizzled family of seven transmembrane receptors. May… │ Secreted │ Ligand for members of the frizzled family of seven │ │ ENSG00000083454 │ purinergic receptor P2X 5 │ positive regulation of calcium ion transport into cytosol positive regulation o… │ Receptor for ATP that acts as a ligand-gated ion channel. │ Membrane │ Receptor for ATP that acts as a ligand-gated ion c │ │ ENSG00000083782 │ epiphycan │ articular cartilage development glycosaminoglycan binding bone development extr… │ May have a role in bone formation and also in establishing the ordered structur… │ Secreted │ May have a role in bone formation and also in esta │ │ ENSG00000087087 │ serrate, RNA effector molecule │ primary miRNA processing regulation of DNA-templated transcription DNA binding … │ Acts as a mediator between the cap-binding complex (CBC) and the primary microR… │ Nucleus │ Acts as a mediator between the cap-binding complex │ │ ENSG00000087502 │ ERGIC and golgi 2 │ retrograde vesicle-mediated transport, Golgi to endoplasmic reticulum transport… │ Possible role in transport between endoplasmic reticulum and Golgi. . │ Endoplasmic reticulum │ Possible role in transport between endoplasmic ret │ │ ENSG00000092201 │ SPT16 homolog, facilitates chromatin remodeling subunit │ nucleoplasm nucleoplasm FACT complex RNA binding nucleoplasm nucleoplasm transc… │ Component of the FACT complex, a general chromatin factor that acts to reorgani… │ Nucleus │ Component of the FACT complex, a general chromatin │ │ ENSG00000102078 │ solute carrier family 25 member 14 │ plasma membrane mitochondrial inner membrane mitochondrial inner membrane mitoc… │ Participates in the mitochondrial proton leak measured in brain mitochondria. │ Mitochondrion inner membrane │ Participates in the mitochondrial proton leak meas │ │ … │ … │ … │ … │ … │ … │ └─────────────────┴─────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────┴────────────────────────────────────────────────────┘
df = query.execute()
Other notes:
ensembl_data = EnsemblRelease(77)
def retrieve_transcript_sequence(i):
try:
contig = ensembl_data.gene_by_id(i)
sequence = contig.transcripts[0].sequence # only use the first transcript
contig_num = contig.transcripts[0].contig
contig_num = contig_num if contig_num.isnumeric() else "Other"
return (sequence, contig_num)
except ValueError:
return ("N/A", "N/A")
def do_translate(nucleotide_seq):
try:
return translate(nucleotide_seq)
except: # I'm lazy
return "N/A"
# Retrieve nucletide sequences and filter out IDs that we could not retrieve
df["nucleotide"], df["contig"] = zip(*[retrieve_transcript_sequence(i) for i in df["id"]])
df = df[df["nucleotide"] != "N/A"]
# Translate into proteins and filter out N/A (not perfect; has some issues because not all have a complete reading frame, so there ae some stops)
# It at least gives us something to embed with for now!
df["protein"] = [do_translate(i) for i in df["nucleotide"]]
df = df[df["protein"] != "N/A"]
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/jordanramsdell/Library/Caches/pyensembl/GRCh38/ensembl77/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/jordanramsdell/Library/Caches/pyensembl/GRCh38/ensembl77/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
# Adding index (for debugging purposes)
df["index"] = list(range(len(df)))
gene_desc_mapper = vectorize_and_embed(df["functionDescriptions"],
metric='hellinger',
vectorizer_fun=lambda: TfidfVectorizer(stop_words='english', norm='l1', max_features=1000))
construct_scatterplot(df, gene_desc_mapper, hover_name="approvedName",
color="location", hover_data=["id", "contig", "truncDesc", "index"])
gene_go_mapper = vectorize_and_embed(df["go_desc"], metric='hellinger',
vectorizer_fun=lambda: TfidfVectorizer(norm="l1"))
construct_scatterplot(df, gene_go_mapper, hover_name="approvedName",
color="location", hover_data=["id", "contig", "truncDesc", "index"])
gene_nucleotide_mapper = vectorize_and_embed(df["nucleotide"], metric='hellinger',
vectorizer_fun=lambda: TfidfVectorizer(analyzer="char", norm="l1",
ngram_range=(1, 3),
lowercase=False))
construct_scatterplot(df, gene_nucleotide_mapper, hover_name="approvedName",
color="location", hover_data=["id", "contig", "truncDesc", "index"])
gene_protein_mapper = vectorize_and_embed(df["protein"], metric='hellinger',
vectorizer_fun=lambda: TfidfVectorizer(analyzer="char",
norm="l1",
ngram_range=(1, 2),
lowercase=False))
construct_scatterplot(df, gene_protein_mapper, hover_name="approvedName",
color="location", hover_data=["id", "contig", "truncDesc", "index"])
pickle.dump(df, open("models/gene_df.sav", 'wb'))
pickle.dump(gene_desc_mapper, open("models/gene_desc_mapper.sav", 'wb'))
pickle.dump(gene_go_mapper, open("models/gene_go_mapper.sav", 'wb'))
pickle.dump(gene_nucleotide_mapper, open("models/gene_nucleotide_mapper.sav", 'wb'))
pickle.dump(gene_protein_mapper, open("models/gene_protein_mapper.sav", 'wb'))